#------------------------------------------------------------------------------#
#
#                 _         _    _      _                _
#                (_)       | |  | |    | |              | |
#   _ __    ___   _  _ __  | |_ | |__  | |  __ _  _ __  | | __
#  | '_ \  / _ \ | || '_ \ | __|| '_ \ | | / _` || '_ \ | |/ /
#  | |_) || (_) || || | | || |_ | |_) || || (_| || | | ||   <
#  | .__/  \___/ |_||_| |_| \__||_.__/ |_| \__,_||_| |_||_|\_\
#  | |
#  |_|
#
#  This file is part of the 'rstudio/pointblank' project.
#
#  Copyright (c) 2017-2025 pointblank authors
#
#  For full copyright and license information, please look at
#  https://rstudio.github.io/pointblank/LICENSE.html
#
#------------------------------------------------------------------------------#


#' Draft a starter **pointblank** validation .R/.Rmd file with a data table
#'
#' @description
#'
#' Generate a draft validation plan in a new .R or .Rmd file using an input data
#' table. Using this workflow, the data table will be scanned to learn about its
#' column data and a set of starter validation steps (constituting a validation
#' plan) will be written. It's best to use a data extract that contains at least
#' 1000 rows and is relatively free of spurious data.
#'
#' Once in the file, it's possible to tweak the validation steps to better fit
#' the expectations to the particular domain. While column inference is used to
#' generate reasonable validation plans, it is difficult to infer the acceptable
#' values without domain expertise. However, using `draft_validation()` could
#' get you started on floor 10 of tackling data quality issues and is in any
#' case better than starting with an empty code editor view.
#'
#' @param tbl *A data table*
#'
#'   `obj:<tbl_*>` // **required**
#'
#'   The input table. This can be a data frame, tibble, a `tbl_dbi` object, or a
#'   `tbl_spark` object.
#'
#' @param tbl_name *A table name*
#'
#'   `scalar<character>` // *default:* `NULL` (`optional`)
#'
#'   A optional name to assign to the input table object. If no value is
#'   provided, a name will be generated based on whatever information is
#'   available. This table name will be displayed in the header area of the
#'   agent report generated by printing the *agent* or calling
#'   [get_agent_report()].
#'
#' @param filename *File name*
#'
#'   `scalar<character>` // *default:* `tbl_name`
#'
#'   An optional name for the .R or .Rmd file. This should be a name without an
#'   extension. By default, this is taken from the `tbl_name` but if nothing is
#'   supplied for that, the name will contain the text `"draft_validation_"`
#'   followed by the current date and time.
#'
#' @param path *File path*
#'
#'   `scalar<character>` // *default:* `NULL` (`optional`)
#'
#'   A path can be specified here if there shouldn't be an attempt to place the
#'   generated file in the working directory.
#'
#' @param lang *Commenting language*
#'
#'   `scalar<character>` // *default:* `NULL` (`optional`)
#'
#'   The language to use when creating comments for the automatically- generated
#'   validation steps. By default, `NULL` will create English (`"en"`) text.
#'   Other options include French (`"fr"`), German (`"de"`), Italian (`"it"`),
#'   Spanish (`"es"`), Portuguese (`"pt"`), Turkish (`"tr"`), Chinese (`"zh"`),
#'   Russian (`"ru"`), Polish (`"pl"`), Danish (`"da"`), Swedish (`"sv"`), and
#'   Dutch (`"nl"`).
#'
#' @param output_type *The output file type*
#'
#'   `singl-kw:[R|Rmd]` // *default:* `"R"`
#'
#'   An option for choosing what type of output should be generated. By default,
#'   this is an .R script (`"R"`) but this could alternatively be an R Markdown
#'   document (`"Rmd"`).
#'
#' @param add_comments *Add comments to the generated validation plan*
#'
#'   `scalar<logical>` // *default:* `TRUE`
#'
#'   Should there be comments that explain the features of the validation plan
#'   in the generated document?
#'
#' @param overwrite *Overwrite a previous file of the same name*
#'
#'   `scalar<logical>` // *default:* `FALSE`
#'
#'   Should a file of the same name be overwritten?
#'
#' @param quiet *Inform (or not) upon file writing*
#'
#'   `scalar<logical>` // *default:* `FALSE`
#'
#'   Should the function *not* inform when the file is written?
#'
#' @return Invisibly returns `TRUE` if the file has been written.
#'
#' @section Supported Input Tables:
#'
#' The types of data tables that are officially supported are:
#'
#'  - data frames (`data.frame`) and tibbles (`tbl_df`)
#'  - Spark DataFrames (`tbl_spark`)
#'  - the following database tables (`tbl_dbi`):
#'    - *PostgreSQL* tables (using the `RPostgres::Postgres()` as driver)
#'    - *MySQL* tables (with `RMySQL::MySQL()`)
#'    - *Microsoft SQL Server* tables (via **odbc**)
#'    - *BigQuery* tables (using `bigrquery::bigquery()`)
#'    - *DuckDB* tables (through `duckdb::duckdb()`)
#'    - *SQLite* (with `RSQLite::SQLite()`)
#'
#' Other database tables may work to varying degrees but they haven't been
#' formally tested (so be mindful of this when using unsupported backends with
#' **pointblank**).
#'
#' @section Examples:
#'
#' Let's draft a validation plan for the `dplyr::storms` dataset.
#'
#' ```{r}
#' dplyr::storms
#' ```
#'
#' The `draft_validation()` function creates an .R file by default. Using just
#' the defaults with `dplyr::storms` will yield the `"dplyr__storms.R"` file
#' in the working directory. Here are the contents of the file:
#'
#' ```r
#' library(pointblank)
#'
#' agent <-
#'   create_agent(
#'     tbl = ~ dplyr::storms,
#'     actions = action_levels(
#'       warn_at = 0.05,
#'       stop_at = 0.10
#'     ),
#'     tbl_name = "dplyr::storms",
#'     label = "Validation plan generated by `draft_validation()`."
#'   ) %>%
#'   # Expect that column `name` is of type: character
#'   col_is_character(
#'     columns = name
#'   ) %>%
#'   # Expect that column `year` is of type: numeric
#'   col_is_numeric(
#'     columns = year
#'   ) %>%
#'   # Expect that values in `year` should be between `1975` and `2020`
#'   col_vals_between(
#'     columns = year,
#'     left = 1975,
#'     right = 2020
#'   ) %>%
#'   # Expect that column `month` is of type: numeric
#'   col_is_numeric(
#'     columns = month
#'   ) %>%
#'   # Expect that values in `month` should be between `1` and `12`
#'   col_vals_between(
#'     columns = month,
#'     left = 1,
#'     right = 12
#'   ) %>%
#'   # Expect that column `day` is of type: integer
#'   col_is_integer(
#'     columns = day
#'   ) %>%
#'   # Expect that values in `day` should be between `1` and `31`
#'   col_vals_between(
#'     columns = day,
#'     left = 1,
#'     right = 31
#'   ) %>%
#'   # Expect that column `hour` is of type: numeric
#'   col_is_numeric(
#'     columns = hour
#'   ) %>%
#'   # Expect that values in `hour` should be between `0` and `23`
#'   col_vals_between(
#'     columns = hour,
#'     left = 0,
#'     right = 23
#'   ) %>%
#'   # Expect that column `lat` is of type: numeric
#'   col_is_numeric(
#'     columns = lat
#'   ) %>%
#'   # Expect that values in `lat` should be between `-90` and `90`
#'   col_vals_between(
#'     columns = lat,
#'     left = -90,
#'     right = 90
#'   ) %>%
#'   # Expect that column `long` is of type: numeric
#'   col_is_numeric(
#'     columns = long
#'   ) %>%
#'   # Expect that values in `long` should be between `-180` and `180`
#'   col_vals_between(
#'     columns = long,
#'     left = -180,
#'     right = 180
#'   ) %>%
#'   # Expect that column `status` is of type: character
#'   col_is_character(
#'     columns = status
#'   ) %>%
#'   # Expect that column `category` is of type: factor
#'   col_is_factor(
#'     columns = category
#'   ) %>%
#'   # Expect that column `wind` is of type: integer
#'   col_is_integer(
#'     columns = wind
#'   ) %>%
#'   # Expect that values in `wind` should be between `10` and `160`
#'   col_vals_between(
#'     columns = wind,
#'     left = 10,
#'     right = 160
#'   ) %>%
#'   # Expect that column `pressure` is of type: integer
#'   col_is_integer(
#'     columns = pressure
#'   ) %>%
#'   # Expect that values in `pressure` should be between `882` and `1022`
#'   col_vals_between(
#'     columns = pressure,
#'     left = 882,
#'     right = 1022
#'   ) %>%
#'   # Expect that column `tropicalstorm_force_diameter` is of type: integer
#'   col_is_integer(
#'     columns = tropicalstorm_force_diameter
#'   ) %>%
#'   # Expect that values in `tropicalstorm_force_diameter` should be between
#'   # `0` and `870`
#'   col_vals_between(
#'     columns = tropicalstorm_force_diameter,
#'     left = 0,
#'     right = 870,
#'     na_pass = TRUE
#'   ) %>%
#'   # Expect that column `hurricane_force_diameter` is of type: integer
#'   col_is_integer(
#'     columns = hurricane_force_diameter
#'   ) %>%
#'   # Expect that values in `hurricane_force_diameter` should be between
#'   # `0` and `300`
#'   col_vals_between(
#'     columns = hurricane_force_diameter,
#'     left = 0,
#'     right = 300,
#'     na_pass = TRUE
#'   ) %>%
#'   # Expect entirely distinct rows across all columns
#'   rows_distinct() %>%
#'   # Expect that column schemas match
#'   col_schema_match(
#'     schema = col_schema(
#'       name = "character",
#'       year = "numeric",
#'       month = "numeric",
#'       day = "integer",
#'       hour = "numeric",
#'       lat = "numeric",
#'       long = "numeric",
#'       status = "character",
#'       category = c("ordered", "factor"),
#'       wind = "integer",
#'       pressure = "integer",
#'       tropicalstorm_force_diameter = "integer",
#'       hurricane_force_diameter = "integer"
#'     )
#'   ) %>%
#'   interrogate()
#'
#' agent
#' ```
#'
#' This is runnable as is, and the promise is that the interrogation should
#' produce no failing test units. After execution, we get the following
#' validation report:
#'
#' \if{html}{
#' \out{
#' `r pb_get_image_tag(file = "man_draft_validation_1.png")`
#' }
#' }
#'
#' All of the expressions in the resulting file constitute just a rough
#' approximation of what a validation plan should be for a dataset. Certainly,
#' the value ranges in the emitted [col_vals_between()] may not be realistic for
#' the `wind` column and may require some modification (the provided `left` and
#' `right` values are just the limits of the provided data). However, note that
#' the `lat` and `long` (latitude and longitude) columns have acceptable ranges
#' (providing the limits of valid lat/lon values). This is thanks to
#' **pointblank**'s column inference routines, which is able to understand what
#' certain columns contain.
#'
#' For an evolving dataset that will experience changes (either in the form of
#' revised data and addition/deletion of rows or columns), the emitted
#' validation will serve as a good first step and changes can more easily be
#' made since there is a foundation to build from.
#'
#'
#' @family Planning and Prep
#' @section Function ID:
#' 1-11
#'
#' @export
draft_validation <- function(
    tbl,
    tbl_name = NULL,
    filename = tbl_name,
    path = NULL,
    lang = NULL,
    output_type = c("R", "Rmd"),
    add_comments = TRUE,
    overwrite = FALSE,
    quiet = FALSE
) {

  output_type <- match.arg(output_type)

  tbl_material <- materialize_table(tbl = tbl)

  column_roles <- get_column_roles(data = tbl_material)
  column_names <- colnames(tbl_material)

  agent <-
    create_agent(
      tbl = tbl,
      tbl_name = tbl_name,
      label = "Validation plan generated by `draft_validation()`.",
      actions = action_levels(warn_at = 0.05, stop_at = 0.10),
      lang = lang
    )

  agent$tbl <- tbl_material

  # Add column-based validation steps to the agent on
  # the basis of column roles
  for (i in seq_along(column_roles)) {

    agent <-
      add_valdn_steps_with_role(
        agent = agent,
        column = column_names[i],
        column_role = column_roles[i]
      )
  }

  # Add the `rows_distinct()` validation step if all rows in the
  # table are distinct
  total_rows <- get_table_total_rows(data = tbl_material)
  distinct_rows <- get_table_total_distinct_rows(data = tbl_material)

  if (distinct_rows == total_rows) {
    agent <- rows_distinct(agent)
  }

  # Add the `col_schema_match()` validation step
  agent <- col_schema_match(agent, schema = col_schema(.tbl = tbl_material))

  # Get the `read_fn` text from `tbl`
  read_fn_name <- deparse(match.call()$tbl)
  read_fn_name <- gsub("^\\s+", "", read_fn_name)
  read_fn_name <- paste(read_fn_name, collapse = "")

  if (is.null(tbl_name) && !grepl("\\s", read_fn_name)) {
    tbl_name <- read_fn_name
  }

  if (read_fn_name == ".") {
    read_fn_name <- NULL
  }

  if (filename == ".") {
    filename <- NULL
  }

  # Create the filename for the pointblank file
  file_name <-
    resolve_file_filename(
      agent = agent,
      name = filename,
      output_type = output_type
    )

  if (is.null(path)) {
    file_path <- "."

  } else {

    if (!fs::dir_exists(path)) {
      # Stop function if the path doesn't exist and inform user
      # that this function won't create a path
      stop(
        "The provided `path` does not exist:\n",
        "* Please create the path",
        call. = FALSE
      )
    }

    file_path <- path
  }

  # Create path that contains the file
  path <- as.character(fs::path_norm(fs::path_wd(file_path, file_name)))

  # Check if the file to write already exists; if it does, don't
  # write the new file if `overwrite` is FALSE
  if (fs::file_exists(path) && !overwrite) {
    stop(
      "A file of the same name already exists:\n",
      "* set `overwrite` to `TRUE`, or\n",
      "* choose a different `file_name`, or\n",
      "* define another `path` for the file",
      call. = FALSE
    )
  }

  # Set a temporary `read_fn` value if one doesn't exist in the agent
  if (is.null(agent$read_fn)) {
    agent$read_fn <- ""
  }

  # Extract all briefs from the validation steps
  briefs <- agent$validation_set$brief

  # Extract all R expressions for the file
  agent_exprs <- agent_get_exprs(agent = agent, expanded = TRUE)

  agent_exprs <-
    gsub(
      "tbl = ,\n",
      paste0(
        "tbl = ~ ",
        ifelse(
          is.null(read_fn_name),
          "CODE_TO_ACCESS_TABLE, # <- Add R code that obtains the data table",
          read_fn_name),
        ",\n"
      ),
      agent_exprs
    )

  agent_exprs <-
    gsub(
      "stop_at = 0.1",
      "stop_at = 0.10",
      agent_exprs
    )

  agent_exprs <-
    gsub(
      "  tbl_name = \".*?\",",
      paste0(
        "  tbl_name = ",
        ifelse(
          is.null(tbl_name) || tbl_name == ".",
          "NULL, # <- Optionally add in the table name",
          paste0("\"", tbl_name, "\",")
        )
      ),
      agent_exprs
    )

  agent_expr_vec <- unlist(strsplit(agent_exprs, " %>%\n", fixed = TRUE))

  if (add_comments) {

    agent_expr_vec_2 <-
      paste(
        paste0("%>%\n  # ", gsub("\\. $", "", briefs), "\n"),
        paste0(" ", gsub("\n", "\n  ", agent_expr_vec[-1])),
        collapse = "  "
      )

  } else {

    agent_expr_vec_2 <-
      paste(
        paste0("%>%\n"),
        paste0(" ", gsub("\n", "\n  ", agent_expr_vec[-1])),
        collapse = "  "
      )
  }

  agent_lines <-
    paste(
      paste0("agent <-\n  ", gsub("\n", "\n  ", agent_expr_vec[1])),
      agent_expr_vec_2,
      collapse = ""
    )

  if (output_type == "R") {

    file_content <-
      paste0(
        "library(pointblank)\n\n",
        agent_lines,
        "%>%\n  interrogate()\n\nagent",
        collapse = ""
      ) %>%
      gsub("  %>%", " %>%", .)

  } else {

    file_content <-
      paste0(
        "---\n",
        "title: \"",
        ifelse(is.null(tbl_name) || tbl_name == ".", "Untitled", tbl_name),
        "\"\n",
        "output: html_document\n",
        "---\n",
        "\n",
        "```{r setup, include=FALSE}\n",
        "knitr::opts_chunk$set(echo = TRUE)\n",
        "library(pointblank)\n",
        "```\n",
        "\n\n",
        "```{r create_agent, echo=TRUE}\n",
        agent_lines,
        "%>%\n  interrogate()\n",
        "```\n",
        "\n\n",
        "```{r print_agent, echo=FALSE}\n",
        "agent\n",
        "```\n",
        collapse = ""
      ) %>%
      gsub("  %>%", " %>%", .)
  }

  # Write the file to the resulting `path`
  pb_write_file(
    path = path,
    lines = file_content,
    append = FALSE
  )

  # Generate cli message
  if (!quiet) {
    cli_bullet_msg(
      msg = paste0(
        "The pointblank .", output_type, " file has been written to `{path}`"
        ),
      bullet = cli::symbol$tick,
      color = "green"
    )
  }

  invisible(TRUE)
}

add_valdn_steps_with_role <- function(agent, column, column_role) {

  if (grepl("string", column_role)) {

    if (inherits(agent$tbl, "data.frame") &&
        is.factor(agent$tbl[[column]])) {

      agent <- col_is_factor(agent, columns = {{ column }})
    } else {
      agent <- col_is_character(agent, columns = {{ column }})
    }
  }

  if (column_role == "integer.discrete") {
    agent <- col_is_integer(agent, columns = {{ column }})
  }

  if (column_role == "boolean.logical.categorical") {
    agent <- col_is_logical(agent, columns = {{ column }})
  }

  if (column_role == "country:iso3166-1-esn.string.categorical") {

    country_names <-
      dplyr::pull(
        get_non_null_col_sample(
          data_column = dplyr::select(agent$tbl, {{ column }}),
          sample_n = 2E8,
          make_distinct = TRUE
        )
      )

    missing_values_column <-
      get_table_total_missing_values(
        data = dplyr::select(agent$tbl, {{ column }})
      )

    if (missing_values_column > 0) {
      country_names <- c(NA_character_, country_names)
    }

    agent <-
      col_vals_in_set(
        agent,
        columns = {{ column }},
        set = country_names
      )
  }

  if (column_role == "country:iso3166-1-a-2.string.categorical") {

    alpha_2 <- countries$alpha_2

    missing_values_column <-
      get_table_total_missing_values(
        data = dplyr::select(agent$tbl, {{ column }})
      )

    if (missing_values_column > 0) {
      alpha_2 <- c(NA_character_, alpha_2)
    }

    agent <-
      col_vals_in_set(
        agent,
        columns = {{ column }},
        set = alpha_2
      )
  }

  if (column_role == "country:iso3166-1-a-3.string.categorical") {

    alpha_3 <- countries$alpha_3

    missing_values_column <-
      get_table_total_missing_values(
        data = dplyr::select(agent$tbl, {{ column }})
      )

    if (missing_values_column > 0) {
      alpha_3 <- c(NA_character_, alpha_3)
    }

    agent <-
      col_vals_in_set(
        agent,
        columns = {{ column }},
        set = alpha_3
      )
  }

  if (grepl("country_subd:iso3166-2\\[...\\].string", column_role)) {

    country <- gsub("(^.*\\[|\\].*$)", "", column_role)

    subd_2 <- subd_list_main[[country]]

    missing_values_column <-
      get_table_total_missing_values(
        data = dplyr::select(agent$tbl, {{ column }})
      )

    if (missing_values_column > 0) {
      subd_2 <- c(NA_character_, subd_2)
    }

    agent <-
      col_vals_in_set(
        agent,
        columns = {{ column }},
        set = subd_2
      )
  }

  if (grepl("numeric", column_role)) {

    agent <-
      col_is_numeric(
        agent,
        columns = {{ column }}
      )
  }

  if (column_role %in% c(
    "numeric.continuous", "numeric.discrete", "numeric", "integer.discrete"
  )) {

    missing_values_column <-
      get_table_total_missing_values(
        data = dplyr::select(agent$tbl, {{ column }})
      )

    summary_list <-
      get_table_column_summary(
        data_column = dplyr::select(agent$tbl, {{ column }}),
        round = Inf
      )

    agent <-
      col_vals_between(
        agent, columns = {{ column }},
        left = summary_list$min, right = summary_list$max,
        na_pass = missing_values_column > 0
      )
  }

  if (grepl("geo:latitude.numeric", column_role)) {

    missing_values_column <-
      get_table_total_missing_values(
        data = dplyr::select(agent$tbl, {{ column }})
      )

    agent <-
      col_vals_between(
        agent, columns = {{ column }},
        left = -90, right = 90,
        na_pass = missing_values_column > 0
      )
  }

  if (grepl("geo:longitude.numeric", column_role)) {

    missing_values_column <-
      get_table_total_missing_values(
        data = dplyr::select(agent$tbl, {{ column }})
      )

    agent <-
      col_vals_between(
        agent, columns = {{ column }},
        left = -180, right = 180,
        na_pass = missing_values_column > 0
      )
  }

  agent
}


resolve_file_filename <- function(agent,
                                  name,
                                  output_type) {

  if (is.null(name)) {

    sys_time <- format(Sys.time(), format = "%Y_%m_%d_%I_%M_%p")
    file_name <-
      paste0("draft_validation_", sys_time, ".", output_type)

  } else {

    if (!is.character(name)) {
      stop(
        "The value supplied to `name` must be of class 'character'.",
        call. = FALSE
      )
    }

    # Handle special case of `pkg::dataset` before sanitization
    if (!grepl("\\s", name) && grepl("::", name)) {
      name <- gsub("::", "__", name, fixed = TRUE)
    }

    file_name <-
      name[1] %>%
      fs::path_sanitize() %>%
      gsub("(\\.| |'|\\:)", "_", .) %>%
      paste0(., ".", output_type)
  }

  file_name
}
